{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from scipy.sparse import csr_matrix,hstack\n",
    "\n",
    "from sklearn.linear_model import Lasso\n",
    "from sklearn.feature_extraction.text import CountVectorizer ,TfidfVectorizer\n",
    "from sklearn.preprocessing import LabelBinarizer\n",
    "from sklearn.model_selection import train_test_split,cross_val_score\n",
    "import lightgbm as lgb\n",
    "import pandas as pd\n",
    "import jieba "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "INPUT_PATH = '../input/'\n",
    "CACHE_PATH = '../cache/'\n",
    "OUTPUT_PATH ='../output/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df_train_first = pd.read_csv(INPUT_PATH + 'train_first.csv')\n",
    "df_train_second = pd.read_csv(INPUT_PATH +'train_second.csv')\n",
    "df_test_first = pd.read_csv(INPUT_PATH+'predict_first.csv')\n",
    "df_test_second = pd.read_csv(INPUT_PATH + 'predict_second.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df_train = pd.concat([df_train_first,df_train_second],axis = 0,ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df_test = df_test_second.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Id</th>\n",
       "      <th>Discuss</th>\n",
       "      <th>Score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>201e8bf2-77a2-3a98-9fcf-4ce03914e712</td>\n",
       "      <td>好大的一个游乐公园，已经去了2次，但感觉还没有玩够似的！会有第三，第四次的</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>f4d51947-eac4-3005-9d3c-2f32d6068a2d</td>\n",
       "      <td>新中国成立也是在这举行，对我们中国人来说有些重要及深刻的意义！</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>74aa7ae4-03a4-394c-bee0-5702d3a3082a</td>\n",
       "      <td>庐山瀑布非常有名，也有非常多个瀑布，只是最好看的非三叠泉莫属，推荐一去</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>099661c2-4360-3c49-a2fe-8c783764f7db</td>\n",
       "      <td>个人觉得颐和园是北京最值的一起的地方，不过相比下门票也是最贵的，比起故宫的雄伟与气势磅礴，颐...</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>97ca672d-e558-3542-ba7b-ee719bba1bab</td>\n",
       "      <td>迪斯尼一日游</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>3b7f3f2e-886f-3a68-a810-2c37cfd728d3</td>\n",
       "      <td>方便</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>88914409-bd13-3d47-b5a2-691177dde8fd</td>\n",
       "      <td>看水看山都可以。感受古人的智慧结晶，秋景美丽如画，红黄绿相间！对于身体状况不佳的人来说，走平...</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>bf13ec92-6079-3451-ade3-88020cb0dcb5</td>\n",
       "      <td>赞</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>489c3d94-9c44-3cf2-949c-1b507c374c69</td>\n",
       "      <td>唯一糟点</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>285bba78-16a3-3c1d-b648-baa483883ee3</td>\n",
       "      <td>周未周边游</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>e7801d96-73d0-35c4-9e00-cc15caaa384a</td>\n",
       "      <td>景点服务不错，就是排队 太长了，好玩的项目都是人，晚上的烟火一定jrvytqlamf要看，真...</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>973afeca-7530-3f56-b7f5-bef36d889025</td>\n",
       "      <td>绍兴护城河夜游</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>cd91dc2f-2331-3c73-bc8d-da027337270d</td>\n",
       "      <td>感觉还不错，作为一日游不错的选择～</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>7ce97eca-63a8-30a1-9687-6796f34606f1</td>\n",
       "      <td>有趣hai xing</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>25e21097-bd41-3589-b12c-62bc7b04eb6d</td>\n",
       "      <td>荡气回肠，10年去的，居然没有留下来照片，必然要再去！&lt;br /&gt;n</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>98e78de7-d5d3-3b30-90d4-a63a6107d532</td>\n",
       "      <td>景色超级棒，有美丽的故事，可以乘船游览，也可以沿湖浏览，累了可以乘坐观光车！关键是没有门票！！！</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>26334fc8-a4f1-3dc3-adb6-76b99d75cdf9</td>\n",
       "      <td>南锣鼓巷是北京市中心一条老胡同，因为其地理位置靠近什刹海，成为北京休闲娱乐的好去处，特别是外...</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>7f4d6d59-f732-3125-8e7d-8bd64c891b94</td>\n",
       "      <td>个人感觉就是个卖小商品的地方，还不便宜，但是晚上夜景挺好看</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>61522e3c-5d2a-3088-b60d-159dbc2976ce</td>\n",
       "      <td>性价比超高</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>37e57244-8d7e-3a1d-8f0f-8b811afb4a6a</td>\n",
       "      <td>挺普通的吧，就在楼下拍了几张图片，反正也是进不去的呵</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>81502f08-b884-38b8-8169-7de7a0680a82</td>\n",
       "      <td>太大了，在里面走了好长时间也就看了不到五分之一。但周围交通方便，值得去看一看</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>533a667c-d6ba-313d-bc29-588b992789e0</td>\n",
       "      <td>迪士尼</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>041e4056-62f2-3f25-8e3e-8f57f66cb3d8</td>\n",
       "      <td>亲子游</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>988f2319-3292-305a-aaaf-ba18bc397e5a</td>\n",
       "      <td>来苏州总是要欣赏一下古典园林的。可惜对园林不太感冒。逛逛玩玩还是不错的。</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>c3d7dd21-79ef-3ff2-b90d-14631e9a30b4</td>\n",
       "      <td>不到长城非好汉，对于爬过华山的我来说，长城太简单了，值得一去。</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>4b12c7b9-059f-3016-a954-3849b0456ce4</td>\n",
       "      <td>很值得去的地方</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>5ba1fa45-4c97-3afe-9dd6-3efea9c73a94</td>\n",
       "      <td>第一次必到景点</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>f6d82a8c-ef72-3a0a-95ca-95aa2fbb7f7d</td>\n",
       "      <td>好歹也是长城</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>9363fc36-92a7-371f-8d5f-5dd71b565455</td>\n",
       "      <td>早上一大早就起床去看升国旗，很庄严，很整齐，就是像一个节目让人转不开眼睛，特别是老辈的人听说...</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>a4dc34f1-6a97-3b86-829c-466cdaa86bf2</td>\n",
       "      <td>登顶是俯瞰故宫的绝佳之处，崇祯帝在此自缢！</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>219970</th>\n",
       "      <td>5922e3e1-6812-3784-8b50-55239107c78b</td>\n",
       "      <td>挺不错的，值得去  ，喜欢那慵懒的阳光</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>219971</th>\n",
       "      <td>9302ede6-40b6-32b4-8e71-b999e41e7060</td>\n",
       "      <td>不错，满意</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>219972</th>\n",
       "      <td>6e37c76c-a7d7-3c87-90d1-d04068a9b663</td>\n",
       "      <td>美美美，景色太美啦！路上还遇到很多次可爱的小松鼠。海拔较高，略冷，我们夏天去的，进去前还有租...</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>219973</th>\n",
       "      <td>c9f39ce6-a44c-3ba1-a562-d17dece83da8</td>\n",
       "      <td>AAAAA景区，很不错，必去！入口是新建的，里面的很漂亮</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>219974</th>\n",
       "      <td>2a09fc7b-40b1-3099-bf2b-7f9638f80c04</td>\n",
       "      <td>在上面泛舟的话，别有一番趣味哟~</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>219975</th>\n",
       "      <td>42acbeda-6aa8-3d40-9d91-c0153e161f15</td>\n",
       "      <td>无聊</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>219976</th>\n",
       "      <td>5bb78617-1c70-3fd0-9135-cd236ac8e722</td>\n",
       "      <td>还可以</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>219977</th>\n",
       "      <td>c3200513-3cca-3ada-84d8-a3266ffb102c</td>\n",
       "      <td>地方不大 里面有义务的大学生讲解员！！！！！</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>219978</th>\n",
       "      <td>0f81e581-c187-3dfb-b863-58ef9689af01</td>\n",
       "      <td>门票不便宜，风景还不错，远眺长江</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>219979</th>\n",
       "      <td>44d4da48-0add-365e-ac24-c0f4185acf62</td>\n",
       "      <td>五大景点没有都去，但是这个价位的联票真的特别特别值</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>219980</th>\n",
       "      <td>ce4b0952-5cfc-3a55-b56f-9e876393943d</td>\n",
       "      <td>宝峰湖是高山上的一个湖泊。来到宝峰湖外的山脚下，美丽的苗族女孩身穿节日盛装，游人可以花点小钱...</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>219981</th>\n",
       "      <td>8f78996b-a83d-38a1-98b6-87d1684894db</td>\n",
       "      <td>绍兴周边游</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>219982</th>\n",
       "      <td>1349fadc-3816-32db-a36a-89fdcdc591a6</td>\n",
       "      <td>只要去北京都会去的地方  故宫不要错过哦</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>219983</th>\n",
       "      <td>11d0bfca-a3a7-390e-a3be-655d58bf40a8</td>\n",
       "      <td>五一长假我和妻子一起开车来玩的垂云通天河 我们游玩了溶洞 里面也凉爽 外面太热就没怎么玩 到...</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>219984</th>\n",
       "      <td>9dc31c54-55ab-30d4-b1f8-f31ea9e82166</td>\n",
       "      <td>性价比一般般</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>219985</th>\n",
       "      <td>c34ed4e8-5d95-37a2-986f-0295613e298c</td>\n",
       "      <td>非常好的一次游玩</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>219986</th>\n",
       "      <td>9d8f492d-f738-32a7-89e8-1be7560e14e4</td>\n",
       "      <td>景色很好，古色古香，绍兴特色。&lt;br /&gt;n</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>219987</th>\n",
       "      <td>2b453f26-8b2d-3bda-b77d-5524ea2d4a96</td>\n",
       "      <td>不到长城非好汉！在德胜门坐877很方便，直达景区！（我是在北京玩了两天，第一天前门、故宫、景...</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>219988</th>\n",
       "      <td>fa8d661c-5b00-349e-96c3-3644f11f08fb</td>\n",
       "      <td>一大早就被我朋友拉到药王山看猴子啦，我还拍了照，就迫不及待的给大家看下啦</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>219989</th>\n",
       "      <td>dec8e07a-805e-3ca7-99d8-afcbe7ef3045</td>\n",
       "      <td>就是故宫的正门，俗称南门，始建于明朝永乐年间。午门南是天安门，北边是太和门。午门是颁发皇帝诏...</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>219990</th>\n",
       "      <td>12197df1-9a80-30d8-aaf1-176210492080</td>\n",
       "      <td>“不去黄龙终生遗憾，去了黄龙遗憾终生”，大家觉得这句话特别贴切，可是本人倒是没觉得，我是喜欢...</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>219991</th>\n",
       "      <td>7c200121-3635-363f-acbd-c639fde31d2d</td>\n",
       "      <td>4月30日，6:40取票后乘金沙索道上山。景色太漂亮啦！奇峰怪石、鬼斧神工，造型奇特，气势雄...</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>219992</th>\n",
       "      <td>b26c9785-c731-36a6-a636-65ed413b1279</td>\n",
       "      <td>文艺青年的聚集地。这里总会有一些画展，喜欢艺术的朋友可以来看看。</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>219993</th>\n",
       "      <td>5d5807bb-0e0d-39c1-b231-5ff3823a919e</td>\n",
       "      <td>千岛湖风景还是不错的，只是上岛不能自己选择余地，只有确定的几个路线，有些岛不是很好玩。</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>219994</th>\n",
       "      <td>330266b3-c05d-3a17-955f-9aba05a3671d</td>\n",
       "      <td>人太多了</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>219995</th>\n",
       "      <td>c5e29d46-ea63-3610-aba3-06fe94e251d1</td>\n",
       "      <td>沙漠中的奇迹，历经千年而不衰。鸣沙山若没有了月牙泉，该是失色不少吧。</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>219996</th>\n",
       "      <td>73dac581-85af-3700-b5b3-2f01f5297718</td>\n",
       "      <td>“双龙海”在芦苇海之上，树正群海的瀑布群之下，透过晶莹的湖水，可以看到海中有两条带状的生物钙...</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>219997</th>\n",
       "      <td>b17ee6da-9dcd-3b47-8ad2-066b3ae5c698</td>\n",
       "      <td>很累吧</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>219998</th>\n",
       "      <td>f6c93407-5128-3f18-b7a1-d0058839df3e</td>\n",
       "      <td>东华门是北京紫禁城东门，紫禁城四门之一，与西华门相对。</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>219999</th>\n",
       "      <td>c12612cc-733a-368b-935e-eb702106a9e2</td>\n",
       "      <td>取票很快，三清山风景秀丽迷人，妈妈网订票方便，下次还要选择这种方式订票！</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>220000 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                          Id  \\\n",
       "0       201e8bf2-77a2-3a98-9fcf-4ce03914e712   \n",
       "1       f4d51947-eac4-3005-9d3c-2f32d6068a2d   \n",
       "2       74aa7ae4-03a4-394c-bee0-5702d3a3082a   \n",
       "3       099661c2-4360-3c49-a2fe-8c783764f7db   \n",
       "4       97ca672d-e558-3542-ba7b-ee719bba1bab   \n",
       "5       3b7f3f2e-886f-3a68-a810-2c37cfd728d3   \n",
       "6       88914409-bd13-3d47-b5a2-691177dde8fd   \n",
       "7       bf13ec92-6079-3451-ade3-88020cb0dcb5   \n",
       "8       489c3d94-9c44-3cf2-949c-1b507c374c69   \n",
       "9       285bba78-16a3-3c1d-b648-baa483883ee3   \n",
       "10      e7801d96-73d0-35c4-9e00-cc15caaa384a   \n",
       "11      973afeca-7530-3f56-b7f5-bef36d889025   \n",
       "12      cd91dc2f-2331-3c73-bc8d-da027337270d   \n",
       "13      7ce97eca-63a8-30a1-9687-6796f34606f1   \n",
       "14      25e21097-bd41-3589-b12c-62bc7b04eb6d   \n",
       "15      98e78de7-d5d3-3b30-90d4-a63a6107d532   \n",
       "16      26334fc8-a4f1-3dc3-adb6-76b99d75cdf9   \n",
       "17      7f4d6d59-f732-3125-8e7d-8bd64c891b94   \n",
       "18      61522e3c-5d2a-3088-b60d-159dbc2976ce   \n",
       "19      37e57244-8d7e-3a1d-8f0f-8b811afb4a6a   \n",
       "20      81502f08-b884-38b8-8169-7de7a0680a82   \n",
       "21      533a667c-d6ba-313d-bc29-588b992789e0   \n",
       "22      041e4056-62f2-3f25-8e3e-8f57f66cb3d8   \n",
       "23      988f2319-3292-305a-aaaf-ba18bc397e5a   \n",
       "24      c3d7dd21-79ef-3ff2-b90d-14631e9a30b4   \n",
       "25      4b12c7b9-059f-3016-a954-3849b0456ce4   \n",
       "26      5ba1fa45-4c97-3afe-9dd6-3efea9c73a94   \n",
       "27      f6d82a8c-ef72-3a0a-95ca-95aa2fbb7f7d   \n",
       "28      9363fc36-92a7-371f-8d5f-5dd71b565455   \n",
       "29      a4dc34f1-6a97-3b86-829c-466cdaa86bf2   \n",
       "...                                      ...   \n",
       "219970  5922e3e1-6812-3784-8b50-55239107c78b   \n",
       "219971  9302ede6-40b6-32b4-8e71-b999e41e7060   \n",
       "219972  6e37c76c-a7d7-3c87-90d1-d04068a9b663   \n",
       "219973  c9f39ce6-a44c-3ba1-a562-d17dece83da8   \n",
       "219974  2a09fc7b-40b1-3099-bf2b-7f9638f80c04   \n",
       "219975  42acbeda-6aa8-3d40-9d91-c0153e161f15   \n",
       "219976  5bb78617-1c70-3fd0-9135-cd236ac8e722   \n",
       "219977  c3200513-3cca-3ada-84d8-a3266ffb102c   \n",
       "219978  0f81e581-c187-3dfb-b863-58ef9689af01   \n",
       "219979  44d4da48-0add-365e-ac24-c0f4185acf62   \n",
       "219980  ce4b0952-5cfc-3a55-b56f-9e876393943d   \n",
       "219981  8f78996b-a83d-38a1-98b6-87d1684894db   \n",
       "219982  1349fadc-3816-32db-a36a-89fdcdc591a6   \n",
       "219983  11d0bfca-a3a7-390e-a3be-655d58bf40a8   \n",
       "219984  9dc31c54-55ab-30d4-b1f8-f31ea9e82166   \n",
       "219985  c34ed4e8-5d95-37a2-986f-0295613e298c   \n",
       "219986  9d8f492d-f738-32a7-89e8-1be7560e14e4   \n",
       "219987  2b453f26-8b2d-3bda-b77d-5524ea2d4a96   \n",
       "219988  fa8d661c-5b00-349e-96c3-3644f11f08fb   \n",
       "219989  dec8e07a-805e-3ca7-99d8-afcbe7ef3045   \n",
       "219990  12197df1-9a80-30d8-aaf1-176210492080   \n",
       "219991  7c200121-3635-363f-acbd-c639fde31d2d   \n",
       "219992  b26c9785-c731-36a6-a636-65ed413b1279   \n",
       "219993  5d5807bb-0e0d-39c1-b231-5ff3823a919e   \n",
       "219994  330266b3-c05d-3a17-955f-9aba05a3671d   \n",
       "219995  c5e29d46-ea63-3610-aba3-06fe94e251d1   \n",
       "219996  73dac581-85af-3700-b5b3-2f01f5297718   \n",
       "219997  b17ee6da-9dcd-3b47-8ad2-066b3ae5c698   \n",
       "219998  f6c93407-5128-3f18-b7a1-d0058839df3e   \n",
       "219999  c12612cc-733a-368b-935e-eb702106a9e2   \n",
       "\n",
       "                                                  Discuss  Score  \n",
       "0                   好大的一个游乐公园，已经去了2次，但感觉还没有玩够似的！会有第三，第四次的      5  \n",
       "1                         新中国成立也是在这举行，对我们中国人来说有些重要及深刻的意义！      4  \n",
       "2                     庐山瀑布非常有名，也有非常多个瀑布，只是最好看的非三叠泉莫属，推荐一去      4  \n",
       "3       个人觉得颐和园是北京最值的一起的地方，不过相比下门票也是最贵的，比起故宫的雄伟与气势磅礴，颐...      5  \n",
       "4                                                  迪斯尼一日游      5  \n",
       "5                                                      方便      4  \n",
       "6       看水看山都可以。感受古人的智慧结晶，秋景美丽如画，红黄绿相间！对于身体状况不佳的人来说，走平...      4  \n",
       "7                                                       赞      5  \n",
       "8                                                    唯一糟点      5  \n",
       "9                                                   周未周边游      5  \n",
       "10      景点服务不错，就是排队 太长了，好玩的项目都是人，晚上的烟火一定jrvytqlamf要看，真...      5  \n",
       "11                                                绍兴护城河夜游      4  \n",
       "12                                      感觉还不错，作为一日游不错的选择～      5  \n",
       "13                                             有趣hai xing      5  \n",
       "14                     荡气回肠，10年去的，居然没有留下来照片，必然要再去！<br />n      5  \n",
       "15       景色超级棒，有美丽的故事，可以乘船游览，也可以沿湖浏览，累了可以乘坐观光车！关键是没有门票！！！      5  \n",
       "16      南锣鼓巷是北京市中心一条老胡同，因为其地理位置靠近什刹海，成为北京休闲娱乐的好去处，特别是外...      5  \n",
       "17                          个人感觉就是个卖小商品的地方，还不便宜，但是晚上夜景挺好看      3  \n",
       "18                                                  性价比超高      5  \n",
       "19                             挺普通的吧，就在楼下拍了几张图片，反正也是进不去的呵      3  \n",
       "20                 太大了，在里面走了好长时间也就看了不到五分之一。但周围交通方便，值得去看一看      4  \n",
       "21                                                    迪士尼      2  \n",
       "22                                                    亲子游      5  \n",
       "23                   来苏州总是要欣赏一下古典园林的。可惜对园林不太感冒。逛逛玩玩还是不错的。      4  \n",
       "24                        不到长城非好汉，对于爬过华山的我来说，长城太简单了，值得一去。      5  \n",
       "25                                                很值得去的地方      5  \n",
       "26                                                第一次必到景点      5  \n",
       "27                                                 好歹也是长城      4  \n",
       "28      早上一大早就起床去看升国旗，很庄严，很整齐，就是像一个节目让人转不开眼睛，特别是老辈的人听说...      5  \n",
       "29                                  登顶是俯瞰故宫的绝佳之处，崇祯帝在此自缢！      4  \n",
       "...                                                   ...    ...  \n",
       "219970                                挺不错的，值得去  ，喜欢那慵懒的阳光      4  \n",
       "219971                                              不错，满意      5  \n",
       "219972  美美美，景色太美啦！路上还遇到很多次可爱的小松鼠。海拔较高，略冷，我们夏天去的，进去前还有租...      5  \n",
       "219973                       AAAAA景区，很不错，必去！入口是新建的，里面的很漂亮      5  \n",
       "219974                                   在上面泛舟的话，别有一番趣味哟~      4  \n",
       "219975                                                 无聊      5  \n",
       "219976                                                还可以      5  \n",
       "219977                             地方不大 里面有义务的大学生讲解员！！！！！      5  \n",
       "219978                                   门票不便宜，风景还不错，远眺长江      4  \n",
       "219979                          五大景点没有都去，但是这个价位的联票真的特别特别值      5  \n",
       "219980  宝峰湖是高山上的一个湖泊。来到宝峰湖外的山脚下，美丽的苗族女孩身穿节日盛装，游人可以花点小钱...      5  \n",
       "219981                                              绍兴周边游      4  \n",
       "219982                               只要去北京都会去的地方  故宫不要错过哦      5  \n",
       "219983  五一长假我和妻子一起开车来玩的垂云通天河 我们游玩了溶洞 里面也凉爽 外面太热就没怎么玩 到...      5  \n",
       "219984                                             性价比一般般      3  \n",
       "219985                                           非常好的一次游玩      5  \n",
       "219986                             景色很好，古色古香，绍兴特色。<br />n      5  \n",
       "219987  不到长城非好汉！在德胜门坐877很方便，直达景区！（我是在北京玩了两天，第一天前门、故宫、景...      4  \n",
       "219988               一大早就被我朋友拉到药王山看猴子啦，我还拍了照，就迫不及待的给大家看下啦      5  \n",
       "219989  就是故宫的正门，俗称南门，始建于明朝永乐年间。午门南是天安门，北边是太和门。午门是颁发皇帝诏...      5  \n",
       "219990  “不去黄龙终生遗憾，去了黄龙遗憾终生”，大家觉得这句话特别贴切，可是本人倒是没觉得，我是喜欢...      5  \n",
       "219991  4月30日，6:40取票后乘金沙索道上山。景色太漂亮啦！奇峰怪石、鬼斧神工，造型奇特，气势雄...      5  \n",
       "219992                   文艺青年的聚集地。这里总会有一些画展，喜欢艺术的朋友可以来看看。      4  \n",
       "219993        千岛湖风景还是不错的，只是上岛不能自己选择余地，只有确定的几个路线，有些岛不是很好玩。      4  \n",
       "219994                                               人太多了      5  \n",
       "219995                 沙漠中的奇迹，历经千年而不衰。鸣沙山若没有了月牙泉，该是失色不少吧。      4  \n",
       "219996  “双龙海”在芦苇海之上，树正群海的瀑布群之下，透过晶莹的湖水，可以看到海中有两条带状的生物钙...      5  \n",
       "219997                                                很累吧      4  \n",
       "219998                        东华门是北京紫禁城东门，紫禁城四门之一，与西华门相对。      4  \n",
       "219999               取票很快，三清山风景秀丽迷人，妈妈网订票方便，下次还要选择这种方式订票！      5  \n",
       "\n",
       "[220000 rows x 3 columns]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache /tmp/jieba.cache\n",
      "Loading model cost 0.561 seconds.\n",
      "Prefix dict has been built succesfully.\n"
     ]
    }
   ],
   "source": [
    "df_train[\"comment\"] = df_train['Discuss'].apply(lambda x : ' '.join(jieba.cut(x)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df_test['comment'] = df_test['Discuss'].apply(lambda x: ' '.join(jieba.cut(x)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# vectorizer = CountVectorizer(ngram_range=(1,8),token_pattern=r'(?u)\\b\\w+\\b', min_df=1)\n",
    "# anlyze = vectorizer.build_analyzer()\n",
    "# for f in anlyze(' '.join(jieba.cut('新中国成立也是在这举行'))):\n",
    "#     print (f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df_date  = pd.concat([df_train,df_test],axis =0,ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Discuss</th>\n",
       "      <th>Id</th>\n",
       "      <th>Score</th>\n",
       "      <th>comment</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>好大的一个游乐公园，已经去了2次，但感觉还没有玩够似的！会有第三，第四次的</td>\n",
       "      <td>201e8bf2-77a2-3a98-9fcf-4ce03914e712</td>\n",
       "      <td>5.0</td>\n",
       "      <td>好大 的 一个 游乐 公园 ， 已经 去 了 2 次 ， 但 感觉 还 没有 玩够 似的 ！...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>新中国成立也是在这举行，对我们中国人来说有些重要及深刻的意义！</td>\n",
       "      <td>f4d51947-eac4-3005-9d3c-2f32d6068a2d</td>\n",
       "      <td>4.0</td>\n",
       "      <td>新 中国 成立 也 是 在 这 举行 ， 对 我们 中国 人 来说 有些 重要 及 深刻 的...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>庐山瀑布非常有名，也有非常多个瀑布，只是最好看的非三叠泉莫属，推荐一去</td>\n",
       "      <td>74aa7ae4-03a4-394c-bee0-5702d3a3082a</td>\n",
       "      <td>4.0</td>\n",
       "      <td>庐山 瀑布 非常 有名 ， 也 有 非常 多个 瀑布 ， 只是 最 好看 的 非 三叠 泉莫...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>个人觉得颐和园是北京最值的一起的地方，不过相比下门票也是最贵的，比起故宫的雄伟与气势磅礴，颐...</td>\n",
       "      <td>099661c2-4360-3c49-a2fe-8c783764f7db</td>\n",
       "      <td>5.0</td>\n",
       "      <td>个人 觉得 颐和园 是 北京 最值 的 一起 的 地方 ， 不过 相比 下 门票 也 是 最...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>迪斯尼一日游</td>\n",
       "      <td>97ca672d-e558-3542-ba7b-ee719bba1bab</td>\n",
       "      <td>5.0</td>\n",
       "      <td>迪斯尼 一日游</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>方便</td>\n",
       "      <td>3b7f3f2e-886f-3a68-a810-2c37cfd728d3</td>\n",
       "      <td>4.0</td>\n",
       "      <td>方便</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>看水看山都可以。感受古人的智慧结晶，秋景美丽如画，红黄绿相间！对于身体状况不佳的人来说，走平...</td>\n",
       "      <td>88914409-bd13-3d47-b5a2-691177dde8fd</td>\n",
       "      <td>4.0</td>\n",
       "      <td>看水 看山 都 可以 。 感受 古人 的 智慧结晶 ， 秋景 美丽 如画 ， 红黄绿 相间 ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>赞</td>\n",
       "      <td>bf13ec92-6079-3451-ade3-88020cb0dcb5</td>\n",
       "      <td>5.0</td>\n",
       "      <td>赞</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>唯一糟点</td>\n",
       "      <td>489c3d94-9c44-3cf2-949c-1b507c374c69</td>\n",
       "      <td>5.0</td>\n",
       "      <td>唯一 糟点</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>周未周边游</td>\n",
       "      <td>285bba78-16a3-3c1d-b648-baa483883ee3</td>\n",
       "      <td>5.0</td>\n",
       "      <td>周未 周边游</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>景点服务不错，就是排队 太长了，好玩的项目都是人，晚上的烟火一定jrvytqlamf要看，真...</td>\n",
       "      <td>e7801d96-73d0-35c4-9e00-cc15caaa384a</td>\n",
       "      <td>5.0</td>\n",
       "      <td>景点 服务 不错 ， 就是 排队   太长 了 ， 好玩 的 项目 都 是 人 ， 晚上 的...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>绍兴护城河夜游</td>\n",
       "      <td>973afeca-7530-3f56-b7f5-bef36d889025</td>\n",
       "      <td>4.0</td>\n",
       "      <td>绍兴 护城河 夜游</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>感觉还不错，作为一日游不错的选择～</td>\n",
       "      <td>cd91dc2f-2331-3c73-bc8d-da027337270d</td>\n",
       "      <td>5.0</td>\n",
       "      <td>感觉 还 不错 ， 作为 一日游 不错 的 选择 ～</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>有趣hai xing</td>\n",
       "      <td>7ce97eca-63a8-30a1-9687-6796f34606f1</td>\n",
       "      <td>5.0</td>\n",
       "      <td>有趣 hai   xing</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>荡气回肠，10年去的，居然没有留下来照片，必然要再去！&lt;br /&gt;n</td>\n",
       "      <td>25e21097-bd41-3589-b12c-62bc7b04eb6d</td>\n",
       "      <td>5.0</td>\n",
       "      <td>荡气回肠 ， 10 年 去 的 ， 居然 没有 留下来 照片 ， 必然 要 再 去 ！ &lt; ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>景色超级棒，有美丽的故事，可以乘船游览，也可以沿湖浏览，累了可以乘坐观光车！关键是没有门票！！！</td>\n",
       "      <td>98e78de7-d5d3-3b30-90d4-a63a6107d532</td>\n",
       "      <td>5.0</td>\n",
       "      <td>景色 超级 棒 ， 有 美丽 的 故事 ， 可以 乘船 游览 ， 也 可以 沿湖 浏览 ， ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>南锣鼓巷是北京市中心一条老胡同，因为其地理位置靠近什刹海，成为北京休闲娱乐的好去处，特别是外...</td>\n",
       "      <td>26334fc8-a4f1-3dc3-adb6-76b99d75cdf9</td>\n",
       "      <td>5.0</td>\n",
       "      <td>南锣鼓巷 是 北京市 中心 一条 老 胡同 ， 因为 其 地理位置 靠近 什刹海 ， 成为 ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>个人感觉就是个卖小商品的地方，还不便宜，但是晚上夜景挺好看</td>\n",
       "      <td>7f4d6d59-f732-3125-8e7d-8bd64c891b94</td>\n",
       "      <td>3.0</td>\n",
       "      <td>个人感觉 就是 个 卖 小商品 的 地方 ， 还 不 便宜 ， 但是 晚上 夜景 挺 好看</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>性价比超高</td>\n",
       "      <td>61522e3c-5d2a-3088-b60d-159dbc2976ce</td>\n",
       "      <td>5.0</td>\n",
       "      <td>性价比 超高</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>挺普通的吧，就在楼下拍了几张图片，反正也是进不去的呵</td>\n",
       "      <td>37e57244-8d7e-3a1d-8f0f-8b811afb4a6a</td>\n",
       "      <td>3.0</td>\n",
       "      <td>挺 普通 的 吧 ， 就 在 楼下 拍 了 几张 图片 ， 反正 也 是 进不去 的 呵</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>太大了，在里面走了好长时间也就看了不到五分之一。但周围交通方便，值得去看一看</td>\n",
       "      <td>81502f08-b884-38b8-8169-7de7a0680a82</td>\n",
       "      <td>4.0</td>\n",
       "      <td>太大 了 ， 在 里面 走 了 好长时间 也 就 看 了 不到 五分之一 。 但 周围 交通...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>迪士尼</td>\n",
       "      <td>533a667c-d6ba-313d-bc29-588b992789e0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>迪士尼</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>亲子游</td>\n",
       "      <td>041e4056-62f2-3f25-8e3e-8f57f66cb3d8</td>\n",
       "      <td>5.0</td>\n",
       "      <td>亲子 游</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>来苏州总是要欣赏一下古典园林的。可惜对园林不太感冒。逛逛玩玩还是不错的。</td>\n",
       "      <td>988f2319-3292-305a-aaaf-ba18bc397e5a</td>\n",
       "      <td>4.0</td>\n",
       "      <td>来 苏州 总是 要 欣赏 一下 古典 园林 的 。 可惜 对 园林 不太 感冒 。 逛逛 玩...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>不到长城非好汉，对于爬过华山的我来说，长城太简单了，值得一去。</td>\n",
       "      <td>c3d7dd21-79ef-3ff2-b90d-14631e9a30b4</td>\n",
       "      <td>5.0</td>\n",
       "      <td>不到长城非好汉 ， 对于 爬 过 华山 的 我 来说 ， 长城 太 简单 了 ， 值得 一去 。</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>很值得去的地方</td>\n",
       "      <td>4b12c7b9-059f-3016-a954-3849b0456ce4</td>\n",
       "      <td>5.0</td>\n",
       "      <td>很 值得 去 的 地方</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>第一次必到景点</td>\n",
       "      <td>5ba1fa45-4c97-3afe-9dd6-3efea9c73a94</td>\n",
       "      <td>5.0</td>\n",
       "      <td>第一次 必到 景点</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>好歹也是长城</td>\n",
       "      <td>f6d82a8c-ef72-3a0a-95ca-95aa2fbb7f7d</td>\n",
       "      <td>4.0</td>\n",
       "      <td>好歹 也 是 长城</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>早上一大早就起床去看升国旗，很庄严，很整齐，就是像一个节目让人转不开眼睛，特别是老辈的人听说...</td>\n",
       "      <td>9363fc36-92a7-371f-8d5f-5dd71b565455</td>\n",
       "      <td>5.0</td>\n",
       "      <td>早上 一大早 就 起床 去 看 升国旗 ， 很 庄严 ， 很 整齐 ， 就是 像 一个 节目...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>登顶是俯瞰故宫的绝佳之处，崇祯帝在此自缢！</td>\n",
       "      <td>a4dc34f1-6a97-3b86-829c-466cdaa86bf2</td>\n",
       "      <td>4.0</td>\n",
       "      <td>登顶 是 俯瞰 故宫 的 绝佳 之 处 ， 崇祯帝 在 此 自缢 ！</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>269970</th>\n",
       "      <td>风景优美</td>\n",
       "      <td>75e3f467-ee98-3dcc-812f-72f7ab6bf80c</td>\n",
       "      <td>NaN</td>\n",
       "      <td>风景优美</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>269971</th>\n",
       "      <td>鼓浪屿是我在国内玩过的景点里比较不错的一个  去厦门是一定要去鼓浪屿的  有美食一条街 还有...</td>\n",
       "      <td>78039063-aa5d-34ab-a1a6-00d49ff6a16f</td>\n",
       "      <td>NaN</td>\n",
       "      <td>鼓浪屿 是 我 在 国内 玩过 的 景点 里 比较 不错 的 一个     去 厦门 是 一...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>269972</th>\n",
       "      <td>这个没啥看头，就是会动而已。。</td>\n",
       "      <td>f8aec5d7-b10c-3005-891a-3e353765cea0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>这个 没 啥 看头 ， 就是 会 动 而已 。 。</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>269973</th>\n",
       "      <td>灵山正如它的名字一样，令人身临其境感到一股灵气，林中鲜脆的树叶及悦耳的鸟叫，让人感觉身心放松...</td>\n",
       "      <td>a056a723-7270-3bf3-ba48-b590d2367f33</td>\n",
       "      <td>NaN</td>\n",
       "      <td>灵山 正如 它 的 名字 一样 ， 令人 身临其境 感到 一股 灵气 ， 林中 鲜脆 的 树...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>269974</th>\n",
       "      <td>很好的景区，就是天公不作美</td>\n",
       "      <td>fb7dfd18-b884-3bb0-88da-f6a91aa2908e</td>\n",
       "      <td>NaN</td>\n",
       "      <td>很 好 的 景区 ， 就是 天公不作美</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>269975</th>\n",
       "      <td>感觉还可以，也就是进去看个景色。</td>\n",
       "      <td>61c44262-2ba0-3764-b2fa-034afa11ba63</td>\n",
       "      <td>NaN</td>\n",
       "      <td>感觉 还 可以 ， 也 就是 进去 看个 景色 。</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>269976</th>\n",
       "      <td>看到了上海的地标建筑，很开心</td>\n",
       "      <td>168ffccc-97f2-3292-89f1-b5e69e2b0342</td>\n",
       "      <td>NaN</td>\n",
       "      <td>看到 了 上海 的 地标 建筑 ， 很 开心</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>269977</th>\n",
       "      <td>因为宝对迪士尼的大部分经典动画都了解，所以整个体验相当不错</td>\n",
       "      <td>11d09de4-6b44-384e-b2ab-e09c70ccb7d2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>因为 宝对 迪士尼 的 大部分 经典 动画 都 了解 ， 所以 整个 体验 相当 不错</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>269978</th>\n",
       "      <td>很雄伟的宫殿，因为拉萨海拔太高了，没有选择爬上去，下次再来争取能爬上去看看，据说里面也很大</td>\n",
       "      <td>ba3329d8-5fc2-3b70-bd63-131797230176</td>\n",
       "      <td>NaN</td>\n",
       "      <td>很 雄伟 的 宫殿 ， 因为 拉萨 海拔 太高 了 ， 没有 选择 爬上去 ， 下次 再 来...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>269979</th>\n",
       "      <td>不错挺方便</td>\n",
       "      <td>f3d2ed80-f677-3d2b-b06d-d2b627ecb89e</td>\n",
       "      <td>NaN</td>\n",
       "      <td>不错 挺 方便</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>269980</th>\n",
       "      <td>景区内十大绝景之一，外形非常之像！还有美丽的爱情故事，令人神往！</td>\n",
       "      <td>09d81155-92f8-3a1b-a96c-18a60ddb0af5</td>\n",
       "      <td>NaN</td>\n",
       "      <td>景区 内 十大 绝景 之一 ， 外形 非常 之像 ！ 还有 美丽 的 爱情故事 ， 令人神往 ！</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>269981</th>\n",
       "      <td>还算不错的，很热闹。每天都会有，晚上场比较震撼&lt;br /&gt;n</td>\n",
       "      <td>180e945e-7b9a-3926-a0d5-c5d37adc6397</td>\n",
       "      <td>NaN</td>\n",
       "      <td>还 算 不错 的 ， 很 热闹 。 每天 都 会 有 ， 晚上 场 比较 震撼 &lt; br  ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>269982</th>\n",
       "      <td>用心去感受这皇家园林博物馆，如果要去细细的品味须要一天的时间。</td>\n",
       "      <td>48f19b81-9830-342c-9b56-5d16461aaf59</td>\n",
       "      <td>NaN</td>\n",
       "      <td>用心 去 感受 这 皇家 园林 博物馆 ， 如果 要 去 细细的 品味 须要 一天 的 时间 。</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>269983</th>\n",
       "      <td>之前东湖是我国第一大城中湖，不过现在是第二大了，武汉汤逊湖现在是第一大城中湖。东湖也是世界三...</td>\n",
       "      <td>d3e78b57-56f3-3477-8212-6375541c4c52</td>\n",
       "      <td>NaN</td>\n",
       "      <td>之前 东湖 是 我国 第一 大 城中 湖 ， 不过 现在 是 第二 大 了 ， 武汉 汤逊湖...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>269984</th>\n",
       "      <td>去的时候刚好在检修 整个大雁塔都被包住了而且不能登上去 光以一个景区来说 并不是那么那么的惊...</td>\n",
       "      <td>4aebdab0-77b1-3c66-9073-d6b620218825</td>\n",
       "      <td>NaN</td>\n",
       "      <td>去 的 时候 刚好 在 检修   整个 大雁塔 都 被 包住 了 而且 不能 登上 去   ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>269985</th>\n",
       "      <td>景色宜人环境优美价格便宜服务周到值。</td>\n",
       "      <td>e6eb2ce4-08ec-3574-915c-fd9086a84c74</td>\n",
       "      <td>NaN</td>\n",
       "      <td>景色宜人 环境优美 价格便宜 服务周到 值 。</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>269986</th>\n",
       "      <td>门票真的很方便，都不用排队了，到那就可以玩啦，景色真的很好</td>\n",
       "      <td>79060b6f-647c-35a7-899e-72aba4d1a40d</td>\n",
       "      <td>NaN</td>\n",
       "      <td>门票 真的 很 方便 ， 都 不用 排队 了 ， 到 那 就 可以 玩 啦 ， 景色 真的 很 好</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>269987</th>\n",
       "      <td>适合亲子</td>\n",
       "      <td>e4522edd-00dc-3499-a03c-1e37bdb13934</td>\n",
       "      <td>NaN</td>\n",
       "      <td>适合 亲子</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>269988</th>\n",
       "      <td>迪士尼之行</td>\n",
       "      <td>4a9c2aa1-7d33-3c9b-a767-e0deff7b82dd</td>\n",
       "      <td>NaN</td>\n",
       "      <td>迪士尼 之 行</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>269989</th>\n",
       "      <td>12年拍摄，拿到第一步单反后留下的照片。值得纪念，那次走野路爬了香山，从山上下到植物园。这是...</td>\n",
       "      <td>00fd7bd9-6dcd-3896-a11a-9f2f2935719e</td>\n",
       "      <td>NaN</td>\n",
       "      <td>12 年 拍摄 ， 拿到 第一步 单反 后 留下 的 照片 。 值得纪念 ， 那次 走野路 ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>269990</th>\n",
       "      <td>算是一个景点，长白山天池水的出口。水量极大，一直这么流，天池水还那么多，真的奇。至于瀑布本身...</td>\n",
       "      <td>db16ef5b-808a-3156-8651-9fdc9e1df664</td>\n",
       "      <td>NaN</td>\n",
       "      <td>算是 一个 景点 ， 长白山天池 水 的 出口 。 水量 极大 ， 一直 这么 流 ， 天池...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>269991</th>\n",
       "      <td>黄山的景色很美，票价也可以接受</td>\n",
       "      <td>df17c87e-fb8e-3e30-891f-2becdb4eb335</td>\n",
       "      <td>NaN</td>\n",
       "      <td>黄山 的 景色 很 美 ， 票价 也 可以 接受</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>269992</th>\n",
       "      <td>也就看看天涯海角</td>\n",
       "      <td>4645e289-2bbc-360e-a204-ef3ada0a7d61</td>\n",
       "      <td>NaN</td>\n",
       "      <td>也 就 看看 天涯海角</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>269993</th>\n",
       "      <td>天一阁半日游</td>\n",
       "      <td>23981e76-6173-35b4-8b7b-75f0f15f1c1d</td>\n",
       "      <td>NaN</td>\n",
       "      <td>天一阁 半日 游</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>269994</th>\n",
       "      <td>景区逛起来不错，人少，慢慢溜达，让孩子玩玩水。整个行程大概2个小时。</td>\n",
       "      <td>8dec9068-ca74-32c1-92cc-aca4deb574af</td>\n",
       "      <td>NaN</td>\n",
       "      <td>景区 逛起来 不错 ， 人少 ， 慢慢 溜达 ， 让 孩子 玩玩 水 。 整个 行程 大概 ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>269995</th>\n",
       "      <td>感觉景色一般，玩的漂流还可以。</td>\n",
       "      <td>cf44f035-ad85-35db-a682-82238d4ad824</td>\n",
       "      <td>NaN</td>\n",
       "      <td>感觉 景色 一般 ， 玩 的 漂流 还 可以 。</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>269996</th>\n",
       "      <td>泰山风景区不错，就是太难上;了，建议坐索道</td>\n",
       "      <td>694536a0-3b9a-3670-824f-c3bff9d510fc</td>\n",
       "      <td>NaN</td>\n",
       "      <td>泰山 风景区 不错 ， 就是 太难 上 ; 了 ， 建议 坐 索道</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>269997</th>\n",
       "      <td>在井冈山革命斗争时期，象山庵曾是红军的重要活动场所，如红四军后方留守所、红四军机炮连、湘赣边...</td>\n",
       "      <td>f0f9bc5e-caab-33a9-9c55-c79ad25a4354</td>\n",
       "      <td>NaN</td>\n",
       "      <td>在 井冈山 革命斗争 时期 ， 象山 庵 曾 是 红军 的 重要 活动场所 ， 如 红四军 ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>269998</th>\n",
       "      <td>只去了动物园，欢乐世界，鳄鱼公园，还看了大马戏，遗憾的是冬天水上世界不开，欢乐世界特别好玩特...</td>\n",
       "      <td>fd321d66-7947-31eb-b940-d1751da3d3e3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>只 去 了 动物园 ， 欢乐 世界 ， 鳄鱼 公园 ， 还 看 了 大 马戏 ， 遗憾 的 ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>269999</th>\n",
       "      <td>带着儿子和老妈游方特</td>\n",
       "      <td>5a6aa35a-e877-3f9e-8a28-91d801c2c71a</td>\n",
       "      <td>NaN</td>\n",
       "      <td>带 着 儿子 和 老妈 游方 特</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>270000 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                  Discuss  \\\n",
       "0                   好大的一个游乐公园，已经去了2次，但感觉还没有玩够似的！会有第三，第四次的   \n",
       "1                         新中国成立也是在这举行，对我们中国人来说有些重要及深刻的意义！   \n",
       "2                     庐山瀑布非常有名，也有非常多个瀑布，只是最好看的非三叠泉莫属，推荐一去   \n",
       "3       个人觉得颐和园是北京最值的一起的地方，不过相比下门票也是最贵的，比起故宫的雄伟与气势磅礴，颐...   \n",
       "4                                                  迪斯尼一日游   \n",
       "5                                                      方便   \n",
       "6       看水看山都可以。感受古人的智慧结晶，秋景美丽如画，红黄绿相间！对于身体状况不佳的人来说，走平...   \n",
       "7                                                       赞   \n",
       "8                                                    唯一糟点   \n",
       "9                                                   周未周边游   \n",
       "10      景点服务不错，就是排队 太长了，好玩的项目都是人，晚上的烟火一定jrvytqlamf要看，真...   \n",
       "11                                                绍兴护城河夜游   \n",
       "12                                      感觉还不错，作为一日游不错的选择～   \n",
       "13                                             有趣hai xing   \n",
       "14                     荡气回肠，10年去的，居然没有留下来照片，必然要再去！<br />n   \n",
       "15       景色超级棒，有美丽的故事，可以乘船游览，也可以沿湖浏览，累了可以乘坐观光车！关键是没有门票！！！   \n",
       "16      南锣鼓巷是北京市中心一条老胡同，因为其地理位置靠近什刹海，成为北京休闲娱乐的好去处，特别是外...   \n",
       "17                          个人感觉就是个卖小商品的地方，还不便宜，但是晚上夜景挺好看   \n",
       "18                                                  性价比超高   \n",
       "19                             挺普通的吧，就在楼下拍了几张图片，反正也是进不去的呵   \n",
       "20                 太大了，在里面走了好长时间也就看了不到五分之一。但周围交通方便，值得去看一看   \n",
       "21                                                    迪士尼   \n",
       "22                                                    亲子游   \n",
       "23                   来苏州总是要欣赏一下古典园林的。可惜对园林不太感冒。逛逛玩玩还是不错的。   \n",
       "24                        不到长城非好汉，对于爬过华山的我来说，长城太简单了，值得一去。   \n",
       "25                                                很值得去的地方   \n",
       "26                                                第一次必到景点   \n",
       "27                                                 好歹也是长城   \n",
       "28      早上一大早就起床去看升国旗，很庄严，很整齐，就是像一个节目让人转不开眼睛，特别是老辈的人听说...   \n",
       "29                                  登顶是俯瞰故宫的绝佳之处，崇祯帝在此自缢！   \n",
       "...                                                   ...   \n",
       "269970                                               风景优美   \n",
       "269971  鼓浪屿是我在国内玩过的景点里比较不错的一个  去厦门是一定要去鼓浪屿的  有美食一条街 还有...   \n",
       "269972                                    这个没啥看头，就是会动而已。。   \n",
       "269973  灵山正如它的名字一样，令人身临其境感到一股灵气，林中鲜脆的树叶及悦耳的鸟叫，让人感觉身心放松...   \n",
       "269974                                      很好的景区，就是天公不作美   \n",
       "269975                                   感觉还可以，也就是进去看个景色。   \n",
       "269976                                     看到了上海的地标建筑，很开心   \n",
       "269977                      因为宝对迪士尼的大部分经典动画都了解，所以整个体验相当不错   \n",
       "269978      很雄伟的宫殿，因为拉萨海拔太高了，没有选择爬上去，下次再来争取能爬上去看看，据说里面也很大   \n",
       "269979                                              不错挺方便   \n",
       "269980                   景区内十大绝景之一，外形非常之像！还有美丽的爱情故事，令人神往！   \n",
       "269981                     还算不错的，很热闹。每天都会有，晚上场比较震撼<br />n   \n",
       "269982                    用心去感受这皇家园林博物馆，如果要去细细的品味须要一天的时间。   \n",
       "269983  之前东湖是我国第一大城中湖，不过现在是第二大了，武汉汤逊湖现在是第一大城中湖。东湖也是世界三...   \n",
       "269984  去的时候刚好在检修 整个大雁塔都被包住了而且不能登上去 光以一个景区来说 并不是那么那么的惊...   \n",
       "269985                                 景色宜人环境优美价格便宜服务周到值。   \n",
       "269986                      门票真的很方便，都不用排队了，到那就可以玩啦，景色真的很好   \n",
       "269987                                               适合亲子   \n",
       "269988                                              迪士尼之行   \n",
       "269989  12年拍摄，拿到第一步单反后留下的照片。值得纪念，那次走野路爬了香山，从山上下到植物园。这是...   \n",
       "269990  算是一个景点，长白山天池水的出口。水量极大，一直这么流，天池水还那么多，真的奇。至于瀑布本身...   \n",
       "269991                                    黄山的景色很美，票价也可以接受   \n",
       "269992                                           也就看看天涯海角   \n",
       "269993                                             天一阁半日游   \n",
       "269994                 景区逛起来不错，人少，慢慢溜达，让孩子玩玩水。整个行程大概2个小时。   \n",
       "269995                                    感觉景色一般，玩的漂流还可以。   \n",
       "269996                              泰山风景区不错，就是太难上;了，建议坐索道   \n",
       "269997  在井冈山革命斗争时期，象山庵曾是红军的重要活动场所，如红四军后方留守所、红四军机炮连、湘赣边...   \n",
       "269998  只去了动物园，欢乐世界，鳄鱼公园，还看了大马戏，遗憾的是冬天水上世界不开，欢乐世界特别好玩特...   \n",
       "269999                                         带着儿子和老妈游方特   \n",
       "\n",
       "                                          Id  Score  \\\n",
       "0       201e8bf2-77a2-3a98-9fcf-4ce03914e712    5.0   \n",
       "1       f4d51947-eac4-3005-9d3c-2f32d6068a2d    4.0   \n",
       "2       74aa7ae4-03a4-394c-bee0-5702d3a3082a    4.0   \n",
       "3       099661c2-4360-3c49-a2fe-8c783764f7db    5.0   \n",
       "4       97ca672d-e558-3542-ba7b-ee719bba1bab    5.0   \n",
       "5       3b7f3f2e-886f-3a68-a810-2c37cfd728d3    4.0   \n",
       "6       88914409-bd13-3d47-b5a2-691177dde8fd    4.0   \n",
       "7       bf13ec92-6079-3451-ade3-88020cb0dcb5    5.0   \n",
       "8       489c3d94-9c44-3cf2-949c-1b507c374c69    5.0   \n",
       "9       285bba78-16a3-3c1d-b648-baa483883ee3    5.0   \n",
       "10      e7801d96-73d0-35c4-9e00-cc15caaa384a    5.0   \n",
       "11      973afeca-7530-3f56-b7f5-bef36d889025    4.0   \n",
       "12      cd91dc2f-2331-3c73-bc8d-da027337270d    5.0   \n",
       "13      7ce97eca-63a8-30a1-9687-6796f34606f1    5.0   \n",
       "14      25e21097-bd41-3589-b12c-62bc7b04eb6d    5.0   \n",
       "15      98e78de7-d5d3-3b30-90d4-a63a6107d532    5.0   \n",
       "16      26334fc8-a4f1-3dc3-adb6-76b99d75cdf9    5.0   \n",
       "17      7f4d6d59-f732-3125-8e7d-8bd64c891b94    3.0   \n",
       "18      61522e3c-5d2a-3088-b60d-159dbc2976ce    5.0   \n",
       "19      37e57244-8d7e-3a1d-8f0f-8b811afb4a6a    3.0   \n",
       "20      81502f08-b884-38b8-8169-7de7a0680a82    4.0   \n",
       "21      533a667c-d6ba-313d-bc29-588b992789e0    2.0   \n",
       "22      041e4056-62f2-3f25-8e3e-8f57f66cb3d8    5.0   \n",
       "23      988f2319-3292-305a-aaaf-ba18bc397e5a    4.0   \n",
       "24      c3d7dd21-79ef-3ff2-b90d-14631e9a30b4    5.0   \n",
       "25      4b12c7b9-059f-3016-a954-3849b0456ce4    5.0   \n",
       "26      5ba1fa45-4c97-3afe-9dd6-3efea9c73a94    5.0   \n",
       "27      f6d82a8c-ef72-3a0a-95ca-95aa2fbb7f7d    4.0   \n",
       "28      9363fc36-92a7-371f-8d5f-5dd71b565455    5.0   \n",
       "29      a4dc34f1-6a97-3b86-829c-466cdaa86bf2    4.0   \n",
       "...                                      ...    ...   \n",
       "269970  75e3f467-ee98-3dcc-812f-72f7ab6bf80c    NaN   \n",
       "269971  78039063-aa5d-34ab-a1a6-00d49ff6a16f    NaN   \n",
       "269972  f8aec5d7-b10c-3005-891a-3e353765cea0    NaN   \n",
       "269973  a056a723-7270-3bf3-ba48-b590d2367f33    NaN   \n",
       "269974  fb7dfd18-b884-3bb0-88da-f6a91aa2908e    NaN   \n",
       "269975  61c44262-2ba0-3764-b2fa-034afa11ba63    NaN   \n",
       "269976  168ffccc-97f2-3292-89f1-b5e69e2b0342    NaN   \n",
       "269977  11d09de4-6b44-384e-b2ab-e09c70ccb7d2    NaN   \n",
       "269978  ba3329d8-5fc2-3b70-bd63-131797230176    NaN   \n",
       "269979  f3d2ed80-f677-3d2b-b06d-d2b627ecb89e    NaN   \n",
       "269980  09d81155-92f8-3a1b-a96c-18a60ddb0af5    NaN   \n",
       "269981  180e945e-7b9a-3926-a0d5-c5d37adc6397    NaN   \n",
       "269982  48f19b81-9830-342c-9b56-5d16461aaf59    NaN   \n",
       "269983  d3e78b57-56f3-3477-8212-6375541c4c52    NaN   \n",
       "269984  4aebdab0-77b1-3c66-9073-d6b620218825    NaN   \n",
       "269985  e6eb2ce4-08ec-3574-915c-fd9086a84c74    NaN   \n",
       "269986  79060b6f-647c-35a7-899e-72aba4d1a40d    NaN   \n",
       "269987  e4522edd-00dc-3499-a03c-1e37bdb13934    NaN   \n",
       "269988  4a9c2aa1-7d33-3c9b-a767-e0deff7b82dd    NaN   \n",
       "269989  00fd7bd9-6dcd-3896-a11a-9f2f2935719e    NaN   \n",
       "269990  db16ef5b-808a-3156-8651-9fdc9e1df664    NaN   \n",
       "269991  df17c87e-fb8e-3e30-891f-2becdb4eb335    NaN   \n",
       "269992  4645e289-2bbc-360e-a204-ef3ada0a7d61    NaN   \n",
       "269993  23981e76-6173-35b4-8b7b-75f0f15f1c1d    NaN   \n",
       "269994  8dec9068-ca74-32c1-92cc-aca4deb574af    NaN   \n",
       "269995  cf44f035-ad85-35db-a682-82238d4ad824    NaN   \n",
       "269996  694536a0-3b9a-3670-824f-c3bff9d510fc    NaN   \n",
       "269997  f0f9bc5e-caab-33a9-9c55-c79ad25a4354    NaN   \n",
       "269998  fd321d66-7947-31eb-b940-d1751da3d3e3    NaN   \n",
       "269999  5a6aa35a-e877-3f9e-8a28-91d801c2c71a    NaN   \n",
       "\n",
       "                                                  comment  \n",
       "0       好大 的 一个 游乐 公园 ， 已经 去 了 2 次 ， 但 感觉 还 没有 玩够 似的 ！...  \n",
       "1       新 中国 成立 也 是 在 这 举行 ， 对 我们 中国 人 来说 有些 重要 及 深刻 的...  \n",
       "2       庐山 瀑布 非常 有名 ， 也 有 非常 多个 瀑布 ， 只是 最 好看 的 非 三叠 泉莫...  \n",
       "3       个人 觉得 颐和园 是 北京 最值 的 一起 的 地方 ， 不过 相比 下 门票 也 是 最...  \n",
       "4                                                 迪斯尼 一日游  \n",
       "5                                                      方便  \n",
       "6       看水 看山 都 可以 。 感受 古人 的 智慧结晶 ， 秋景 美丽 如画 ， 红黄绿 相间 ...  \n",
       "7                                                       赞  \n",
       "8                                                   唯一 糟点  \n",
       "9                                                  周未 周边游  \n",
       "10      景点 服务 不错 ， 就是 排队   太长 了 ， 好玩 的 项目 都 是 人 ， 晚上 的...  \n",
       "11                                              绍兴 护城河 夜游  \n",
       "12                             感觉 还 不错 ， 作为 一日游 不错 的 选择 ～  \n",
       "13                                          有趣 hai   xing  \n",
       "14      荡气回肠 ， 10 年 去 的 ， 居然 没有 留下来 照片 ， 必然 要 再 去 ！ < ...  \n",
       "15      景色 超级 棒 ， 有 美丽 的 故事 ， 可以 乘船 游览 ， 也 可以 沿湖 浏览 ， ...  \n",
       "16      南锣鼓巷 是 北京市 中心 一条 老 胡同 ， 因为 其 地理位置 靠近 什刹海 ， 成为 ...  \n",
       "17          个人感觉 就是 个 卖 小商品 的 地方 ， 还 不 便宜 ， 但是 晚上 夜景 挺 好看  \n",
       "18                                                 性价比 超高  \n",
       "19           挺 普通 的 吧 ， 就 在 楼下 拍 了 几张 图片 ， 反正 也 是 进不去 的 呵  \n",
       "20      太大 了 ， 在 里面 走 了 好长时间 也 就 看 了 不到 五分之一 。 但 周围 交通...  \n",
       "21                                                    迪士尼  \n",
       "22                                                   亲子 游  \n",
       "23      来 苏州 总是 要 欣赏 一下 古典 园林 的 。 可惜 对 园林 不太 感冒 。 逛逛 玩...  \n",
       "24       不到长城非好汉 ， 对于 爬 过 华山 的 我 来说 ， 长城 太 简单 了 ， 值得 一去 。  \n",
       "25                                            很 值得 去 的 地方  \n",
       "26                                              第一次 必到 景点  \n",
       "27                                              好歹 也 是 长城  \n",
       "28      早上 一大早 就 起床 去 看 升国旗 ， 很 庄严 ， 很 整齐 ， 就是 像 一个 节目...  \n",
       "29                     登顶 是 俯瞰 故宫 的 绝佳 之 处 ， 崇祯帝 在 此 自缢 ！  \n",
       "...                                                   ...  \n",
       "269970                                               风景优美  \n",
       "269971  鼓浪屿 是 我 在 国内 玩过 的 景点 里 比较 不错 的 一个     去 厦门 是 一...  \n",
       "269972                          这个 没 啥 看头 ， 就是 会 动 而已 。 。  \n",
       "269973  灵山 正如 它 的 名字 一样 ， 令人 身临其境 感到 一股 灵气 ， 林中 鲜脆 的 树...  \n",
       "269974                                很 好 的 景区 ， 就是 天公不作美  \n",
       "269975                          感觉 还 可以 ， 也 就是 进去 看个 景色 。  \n",
       "269976                             看到 了 上海 的 地标 建筑 ， 很 开心  \n",
       "269977        因为 宝对 迪士尼 的 大部分 经典 动画 都 了解 ， 所以 整个 体验 相当 不错  \n",
       "269978  很 雄伟 的 宫殿 ， 因为 拉萨 海拔 太高 了 ， 没有 选择 爬上去 ， 下次 再 来...  \n",
       "269979                                            不错 挺 方便  \n",
       "269980   景区 内 十大 绝景 之一 ， 外形 非常 之像 ！ 还有 美丽 的 爱情故事 ， 令人神往 ！  \n",
       "269981  还 算 不错 的 ， 很 热闹 。 每天 都 会 有 ， 晚上 场 比较 震撼 < br  ...  \n",
       "269982   用心 去 感受 这 皇家 园林 博物馆 ， 如果 要 去 细细的 品味 须要 一天 的 时间 。  \n",
       "269983  之前 东湖 是 我国 第一 大 城中 湖 ， 不过 现在 是 第二 大 了 ， 武汉 汤逊湖...  \n",
       "269984  去 的 时候 刚好 在 检修   整个 大雁塔 都 被 包住 了 而且 不能 登上 去   ...  \n",
       "269985                            景色宜人 环境优美 价格便宜 服务周到 值 。  \n",
       "269986  门票 真的 很 方便 ， 都 不用 排队 了 ， 到 那 就 可以 玩 啦 ， 景色 真的 很 好  \n",
       "269987                                              适合 亲子  \n",
       "269988                                            迪士尼 之 行  \n",
       "269989  12 年 拍摄 ， 拿到 第一步 单反 后 留下 的 照片 。 值得纪念 ， 那次 走野路 ...  \n",
       "269990  算是 一个 景点 ， 长白山天池 水 的 出口 。 水量 极大 ， 一直 这么 流 ， 天池...  \n",
       "269991                           黄山 的 景色 很 美 ， 票价 也 可以 接受  \n",
       "269992                                        也 就 看看 天涯海角  \n",
       "269993                                           天一阁 半日 游  \n",
       "269994  景区 逛起来 不错 ， 人少 ， 慢慢 溜达 ， 让 孩子 玩玩 水 。 整个 行程 大概 ...  \n",
       "269995                           感觉 景色 一般 ， 玩 的 漂流 还 可以 。  \n",
       "269996                  泰山 风景区 不错 ， 就是 太难 上 ; 了 ， 建议 坐 索道  \n",
       "269997  在 井冈山 革命斗争 时期 ， 象山 庵 曾 是 红军 的 重要 活动场所 ， 如 红四军 ...  \n",
       "269998  只 去 了 动物园 ， 欢乐 世界 ， 鳄鱼 公园 ， 还 看 了 大 马戏 ， 遗憾 的 ...  \n",
       "269999                                   带 着 儿子 和 老妈 游方 特  \n",
       "\n",
       "[270000 rows x 4 columns]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_date"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "vectorizer = CountVectorizer(ngram_range=(1,2),token_pattern=r'(?u)\\b\\w+\\b', min_df=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "x_comment = vectorizer.fit_transform(df_date[\"comment\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<270000x1849617 sparse matrix of type '<class 'numpy.int64'>'\n",
       "\twith 11548517 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x_comment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "x_t = x_comment[:df_train.shape[0]]\n",
    "x_test = x_comment[df_train.shape[0]:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<220000x1849617 sparse matrix of type '<class 'numpy.int64'>'\n",
       "\twith 9348032 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x_t"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "y = df_train[\"Score\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "x_train, x_val, y_train, y_val = train_test_split(x_t, y, test_size = 0.1, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "reg = Lasso(alpha = 0.01)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Lasso(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=1000,\n",
       "   normalize=False, positive=False, precompute=False, random_state=None,\n",
       "   selection='cyclic', tol=0.0001, warm_start=False)"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "reg.fit(x_train,y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "y_pred = reg.predict(x_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import datetime\n",
    "time = datetime.datetime.now()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "sub = pd.read_csv(INPUT_PATH + 'sample.csv',header=None,names=['Id','Score'])\n",
    "sub['Score'] = y_pred\n",
    "sub.to_csv(OUTPUT_PATH + 'lasso_{}.csv'.format(time.strftime('%Y-%m-%d-%H:%M:%S')),index=False, header=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "word_vectorizer = TfidfVectorizer(ngram_range=(1, 4),max_features=50000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "x_tfidf = word_vectorizer.fit_transform(df_date[\"comment\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "x_t = x_comment[:df_train.shape[0]]\n",
    "x_test = x_comment[df_train.shape[0]:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "x_train, x_val, y_train, y_val = train_test_split(x_t, y, test_size = 0.1, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "reg_tfidf = Lasso(alpha = 0.01)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Lasso(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=1000,\n",
       "   normalize=False, positive=False, precompute=False, random_state=None,\n",
       "   selection='cyclic', tol=0.0001, warm_start=False)"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "reg_tfidf.fit(x_train,y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def round_score(score):\n",
    "    if score>4.7:\n",
    "        score = 5\n",
    "    if 3.8 < score < 4:\n",
    "        score= 4\n",
    "    if score<1:\n",
    "        score= 1\n",
    "    return score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_tfidf = reg_tfidf.predict(x_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "y_tfidf[y_tfidf >4.6 ] = 5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import datetime\n",
    "time = datetime.datetime.now()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [],
   "source": [
    "# sample.csv is read without a header row, so the two columns are named here.\n",
    "sub = pd.read_csv(INPUT_PATH + 'sample.csv', header=None, names=['Id', 'Score'])\n",
    "sub['Score'] = y_tfidf\n",
    "# Dash-separated timestamp: ':' is not allowed in file names on Windows.\n",
    "submission_name = 'lasso_{}.csv'.format(time.strftime('%Y-%m-%d-%H-%M-%S'))\n",
    "sub.to_csv(OUTPUT_PATH + submission_name, index=False, header=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Id</th>\n",
       "      <th>Score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>110</th>\n",
       "      <td>119091b9-c9da-35a8-b82f-222a78bd1302</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>134</th>\n",
       "      <td>7efb9895-8721-34d3-8a43-785fc4e9cf88</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>240</th>\n",
       "      <td>81d2b601-7cca-3658-9ab1-aa1b366cbc00</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>265</th>\n",
       "      <td>74339c96-d4eb-396b-ac15-492bcd7196ed</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>306</th>\n",
       "      <td>cbdf031a-21b6-304e-95f8-fd40f22c4845</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>362</th>\n",
       "      <td>f660a3e8-e07c-3cc7-b53a-166183d41802</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>395</th>\n",
       "      <td>727925b6-1738-3d9a-8f49-98ba01959708</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>475</th>\n",
       "      <td>f0670e8d-2a60-3be3-99d5-0b539db18cca</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>491</th>\n",
       "      <td>d8abc748-2cd7-3db0-81bc-792c6093792b</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>617</th>\n",
       "      <td>c1d219a4-78bc-3b30-9b03-90ce2e4ce0d0</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>637</th>\n",
       "      <td>09700d52-dedb-3dac-974e-716f50be10e7</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>673</th>\n",
       "      <td>ac0665ec-47b2-32d2-95c6-f94e8cde5e64</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>820</th>\n",
       "      <td>704c51f9-a2f8-34dd-9f70-c52d06b307e6</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>876</th>\n",
       "      <td>b6265411-76f2-3628-b17d-f4dc8a57655e</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>883</th>\n",
       "      <td>6164c994-0520-38a9-9763-bf01858eb739</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>962</th>\n",
       "      <td>7bb92168-b817-3917-a25a-308b3d7a11b8</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1013</th>\n",
       "      <td>67fa5cc2-f184-382e-9f9b-ee96316e057a</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1103</th>\n",
       "      <td>ad96d728-56ce-3a3e-a174-7e564c9abe06</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1234</th>\n",
       "      <td>c27c4b76-292c-3408-9985-1dacab95ee06</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1240</th>\n",
       "      <td>1dcc1555-f504-3d3c-b995-5fbebefde75e</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1385</th>\n",
       "      <td>8b9a9018-c2ee-3c84-bbaf-f20e025b0dff</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1392</th>\n",
       "      <td>b10e9fcc-a168-3c8d-949e-253be62ccbf3</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1552</th>\n",
       "      <td>e77d9c59-c5d1-33b7-ac36-4b00a57a2f01</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1617</th>\n",
       "      <td>a8232897-f482-3e27-b487-27021c613c7b</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1623</th>\n",
       "      <td>3f0cc317-33e1-3881-ad95-af16afd65fb0</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1706</th>\n",
       "      <td>5782ddde-31bf-357d-a37d-4fb835ae7504</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1714</th>\n",
       "      <td>0cfb683c-8150-3282-ab30-5a3306b43379</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1749</th>\n",
       "      <td>f2fbb030-4616-32d6-9da4-469d6493cb8b</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1773</th>\n",
       "      <td>625a1e2d-fb35-3c4a-ac11-f7a7ddfacdda</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1828</th>\n",
       "      <td>554cc0df-45fa-3c36-9448-1dcfccf79858</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48208</th>\n",
       "      <td>5abecf96-54a2-3ed9-a867-9fe61cde0b1c</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48272</th>\n",
       "      <td>0958cbdf-5477-3afe-9895-1d325b1c60b2</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48276</th>\n",
       "      <td>375b7216-7b85-30d1-bf0a-e897aa3a27d2</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48496</th>\n",
       "      <td>aa989473-2f49-3ec1-8c7d-77aa984dd04d</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48606</th>\n",
       "      <td>10c9f33f-4cdf-35c6-b8d8-eaeb39bc4122</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48688</th>\n",
       "      <td>188e7850-bc9c-3111-9bc6-8329751a347c</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48748</th>\n",
       "      <td>8a4fa89a-552d-3f8c-8a83-ae5fc1df256a</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48787</th>\n",
       "      <td>b6a8ba8d-7189-3291-bd74-42b99301fd86</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48807</th>\n",
       "      <td>a86a2558-e394-31d8-9136-ace0c4c0b114</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49031</th>\n",
       "      <td>04ed7490-dcde-3283-87e6-2011776e90ac</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49118</th>\n",
       "      <td>0f250664-8056-32d7-8eb2-b8238b459b03</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49149</th>\n",
       "      <td>8986c6dd-3fe6-3ba2-8380-ee095af7cbd0</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49206</th>\n",
       "      <td>c04d8ba7-74a9-3a06-ade2-713b35b23e6a</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49389</th>\n",
       "      <td>69c6fced-1fb0-300c-852b-6e95c64d098c</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49430</th>\n",
       "      <td>cbbb1b94-4ff0-3394-9303-d32f50b53b51</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49435</th>\n",
       "      <td>165dfb15-e3a5-3cdc-a7e5-226f74f079ac</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49444</th>\n",
       "      <td>fb747ecc-4b56-3b43-bcf4-48ab87fd52d1</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49474</th>\n",
       "      <td>00b8f7eb-d6bd-3e50-b82a-6e05b3d58730</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49477</th>\n",
       "      <td>ebfd138f-ceb6-3feb-bd2d-66e15a8f933c</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49589</th>\n",
       "      <td>68f89bf0-5f53-35b7-bdff-1ce1c155f5d1</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49591</th>\n",
       "      <td>06586b53-026d-311e-bb01-d4dc49e130d5</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49603</th>\n",
       "      <td>55856863-33d8-3e40-b981-ddcd058787ba</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49636</th>\n",
       "      <td>8cdbb514-179b-3fb6-9f66-4ffdfa1e965d</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49669</th>\n",
       "      <td>6c371d95-e01e-30e0-91fd-49d3ba91c459</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49787</th>\n",
       "      <td>eb44972b-6119-3c25-8eba-0e0fab98ebbf</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49890</th>\n",
       "      <td>374ece67-b45a-38e5-8029-77ca2897606e</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49913</th>\n",
       "      <td>8747143d-5aae-3ce7-b4b3-8a484eef1097</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49925</th>\n",
       "      <td>8d7bb8f7-9090-3655-b604-2ae0d08ad5ef</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49928</th>\n",
       "      <td>f5a7ae46-5e6e-38aa-a2fc-8515b6239e82</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49949</th>\n",
       "      <td>3afb3271-e0b2-3bb9-812a-edd9d2037200</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>871 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                         Id  Score\n",
       "110    119091b9-c9da-35a8-b82f-222a78bd1302    5.0\n",
       "134    7efb9895-8721-34d3-8a43-785fc4e9cf88    5.0\n",
       "240    81d2b601-7cca-3658-9ab1-aa1b366cbc00    5.0\n",
       "265    74339c96-d4eb-396b-ac15-492bcd7196ed    5.0\n",
       "306    cbdf031a-21b6-304e-95f8-fd40f22c4845    5.0\n",
       "362    f660a3e8-e07c-3cc7-b53a-166183d41802    5.0\n",
       "395    727925b6-1738-3d9a-8f49-98ba01959708    5.0\n",
       "475    f0670e8d-2a60-3be3-99d5-0b539db18cca    5.0\n",
       "491    d8abc748-2cd7-3db0-81bc-792c6093792b    5.0\n",
       "617    c1d219a4-78bc-3b30-9b03-90ce2e4ce0d0    5.0\n",
       "637    09700d52-dedb-3dac-974e-716f50be10e7    5.0\n",
       "673    ac0665ec-47b2-32d2-95c6-f94e8cde5e64    5.0\n",
       "820    704c51f9-a2f8-34dd-9f70-c52d06b307e6    5.0\n",
       "876    b6265411-76f2-3628-b17d-f4dc8a57655e    5.0\n",
       "883    6164c994-0520-38a9-9763-bf01858eb739    5.0\n",
       "962    7bb92168-b817-3917-a25a-308b3d7a11b8    5.0\n",
       "1013   67fa5cc2-f184-382e-9f9b-ee96316e057a    5.0\n",
       "1103   ad96d728-56ce-3a3e-a174-7e564c9abe06    5.0\n",
       "1234   c27c4b76-292c-3408-9985-1dacab95ee06    5.0\n",
       "1240   1dcc1555-f504-3d3c-b995-5fbebefde75e    5.0\n",
       "1385   8b9a9018-c2ee-3c84-bbaf-f20e025b0dff    5.0\n",
       "1392   b10e9fcc-a168-3c8d-949e-253be62ccbf3    5.0\n",
       "1552   e77d9c59-c5d1-33b7-ac36-4b00a57a2f01    5.0\n",
       "1617   a8232897-f482-3e27-b487-27021c613c7b    5.0\n",
       "1623   3f0cc317-33e1-3881-ad95-af16afd65fb0    5.0\n",
       "1706   5782ddde-31bf-357d-a37d-4fb835ae7504    5.0\n",
       "1714   0cfb683c-8150-3282-ab30-5a3306b43379    5.0\n",
       "1749   f2fbb030-4616-32d6-9da4-469d6493cb8b    5.0\n",
       "1773   625a1e2d-fb35-3c4a-ac11-f7a7ddfacdda    5.0\n",
       "1828   554cc0df-45fa-3c36-9448-1dcfccf79858    5.0\n",
       "...                                     ...    ...\n",
       "48208  5abecf96-54a2-3ed9-a867-9fe61cde0b1c    5.0\n",
       "48272  0958cbdf-5477-3afe-9895-1d325b1c60b2    5.0\n",
       "48276  375b7216-7b85-30d1-bf0a-e897aa3a27d2    5.0\n",
       "48496  aa989473-2f49-3ec1-8c7d-77aa984dd04d    5.0\n",
       "48606  10c9f33f-4cdf-35c6-b8d8-eaeb39bc4122    5.0\n",
       "48688  188e7850-bc9c-3111-9bc6-8329751a347c    5.0\n",
       "48748  8a4fa89a-552d-3f8c-8a83-ae5fc1df256a    5.0\n",
       "48787  b6a8ba8d-7189-3291-bd74-42b99301fd86    5.0\n",
       "48807  a86a2558-e394-31d8-9136-ace0c4c0b114    5.0\n",
       "49031  04ed7490-dcde-3283-87e6-2011776e90ac    5.0\n",
       "49118  0f250664-8056-32d7-8eb2-b8238b459b03    5.0\n",
       "49149  8986c6dd-3fe6-3ba2-8380-ee095af7cbd0    5.0\n",
       "49206  c04d8ba7-74a9-3a06-ade2-713b35b23e6a    5.0\n",
       "49389  69c6fced-1fb0-300c-852b-6e95c64d098c    5.0\n",
       "49430  cbbb1b94-4ff0-3394-9303-d32f50b53b51    5.0\n",
       "49435  165dfb15-e3a5-3cdc-a7e5-226f74f079ac    5.0\n",
       "49444  fb747ecc-4b56-3b43-bcf4-48ab87fd52d1    5.0\n",
       "49474  00b8f7eb-d6bd-3e50-b82a-6e05b3d58730    5.0\n",
       "49477  ebfd138f-ceb6-3feb-bd2d-66e15a8f933c    5.0\n",
       "49589  68f89bf0-5f53-35b7-bdff-1ce1c155f5d1    5.0\n",
       "49591  06586b53-026d-311e-bb01-d4dc49e130d5    5.0\n",
       "49603  55856863-33d8-3e40-b981-ddcd058787ba    5.0\n",
       "49636  8cdbb514-179b-3fb6-9f66-4ffdfa1e965d    5.0\n",
       "49669  6c371d95-e01e-30e0-91fd-49d3ba91c459    5.0\n",
       "49787  eb44972b-6119-3c25-8eba-0e0fab98ebbf    5.0\n",
       "49890  374ece67-b45a-38e5-8029-77ca2897606e    5.0\n",
       "49913  8747143d-5aae-3ce7-b4b3-8a484eef1097    5.0\n",
       "49925  8d7bb8f7-9090-3655-b604-2ae0d08ad5ef    5.0\n",
       "49928  f5a7ae46-5e6e-38aa-a2fc-8515b6239e82    5.0\n",
       "49949  3afb3271-e0b2-3bb9-812a-edd9d2037200    5.0\n",
       "\n",
       "[871 rows x 2 columns]"
      ]
     },
     "execution_count": 84,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the submission rows whose Score equals 5.\n",
    "is_top_score = sub['Score'] == 5\n",
    "sub.loc[is_top_score]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x7f74ae157470>"
      ]
     },
     "execution_count": 85,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYAAAAD6CAYAAACoCZCsAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAADUZJREFUeJzt3VGMXOdZgOH3q50gs0tTV45WhAvc\nSJZQRWJIVsERbplFTuUQhKoIKS2oKCrVXjSCC3qBUYNQSgsUQUUFKWhFqEOk1gpFoIJbEhVliFUc\niLeFODcRgjpVTapSOdhaA4VEHxdz4h07a+/ZmZ05m/3eR1r57L/H9j9/Ts7rM3N2NjITSVI9b+p6\nApKkbhgASSrKAEhSUQZAkooyAJJUlAGQpKIMgCQVZQAkqSgDIElF7ex6AteyZ8+e3Lt3b9fT2FQX\nL15kZmam62lsCa7FgOuwyrVYNc5aLC8vfzszb1xvvy0dgL1793Lq1Kmup7Gp+v0+vV6v62lsCa7F\ngOuwyrVYNc5aRMSLbfbzKSBJKsoASFJRBkCSijIAklSUAZCkogyAJBVlACSpKAMgSUUZAEkqakt/\nJ7Ck7W/vkeOXto8e9m0gpskrAEkqygBIUlEGQJKKMgCSVJQBkKSiDIAkFWUAJKkoAyBJRRkASSrK\nAEhSUQZAkooyAJJUlAGQpKIMgCQVZQAkqSgDIElFGQBJKmrdAETEDRHxxYh4MiL+IiKuj4hHIuJk\nRDw4tN/IY5Kk6WtzBfCzwCcy813AN4H3ADsy807g5ojYFxH3jjo2mYclSVpPZGb7nSM+B7wZ+L3M\n/EJEvAfYBfww8DejjGXmp6/4OxaBRYC5ubnbjx07Nv6j3EJWVlaYnZ3tehpbgmsxUH0dTp89f2n7\nbTfsKL0Ww8Y5LhYWFpYzc369/Vr/UPiIuBPYDZwBzjbD54DbgJkxxi6TmUvAEsD8/Hz2er22U3xD\n6Pf7bLfHNCrXYqD6Otx/xQ+Fr7wWw6ZxXLR6ETgi3gr8PvB+YIXBv+YBZps/Y5wxSVIH2rwIfD3w\nZ8CvZOaLwDJwsPnyfgZXBOOMSZI60OYpoJ9n8FTNhyPiw8CngfdFxE3A3cABIIETI45Jkjqw7hVA\nZv5hZu7OzF7z8SjQA54BFjLzfGZeGHVsEg9KkrS+1i8CD8vMl4HHN2tMkjR9vggrSUUZAEkqygBI\nUlEGQJKKMgCSVJQBkKSiDIAkFWUAJKkoAyBJRRkASSrKAEhSUQZAkooyAJJUlAGQpKIMgCQVZQAk\nqSgDIElFGQBJKsoASFJRBkCSijIAklSUAZCkogyAJBVlACSpKAMgSUUZAEkqygBIUlEGQJKKMgCS\nVJQBkKSiDIAkFWUAJKkoAyBJRRkASSrKAEhSUQZAkooyAJJUVKsARMRcRJxotr8vIr4REf3m48Zm\n/JGIOBkRDw79vlZjkqTpWzcAEbEbeBSYaYZ+BPhYZvaaj/+IiHuBHZl5J3BzROxrOzaZhyVJWk+b\nK4BXgfuAC83nB4APRMRXIuI3mrEe8Hiz/SRwcANjkqQO7Fxvh8y8ABARrw19Efh14L+AL0XErQyu\nDs42Xz8H3LaBsctExCKwCDA3N0e/39/gQ9raVlZWtt1jGpVrMVB9HT50yyuXtquvxbBprMW6AVjD\n32fmdwAi4qvAPmAF2NV8fZbBlUXbsctk5hKwBDA/P5+9Xm+EKW5d/X6f7faYRuVaDFRfh/uPHL+0\nffTwTOm1GDaN42KUu4CeiIjvjYjvBt4FPA8ss/p0zn7gzAbGJEkdGOUK4CHgKeB/gT/KzBci4iXg\nRETcBNzN4HWCbDkmSepA6yuAzOw1vz6VmT+Qmbdm5h80YxcYvMD7DLCQmefbjm3iY5EkbcAoVwBr\nysyXWb3DZ0NjkqTp8zuBJakoAyBJRR
kASSrKAEhSUQZAkooyAJJUlAGQpKIMgCQVZQAkqSgDIElF\nGQBJKsoASFJRBkCSijIAklSUAZCkogyAJBVlACSpKAMgSUUZAEkqygBIUlEGQJKKMgCSVJQBkKSi\nDIAkFWUAJKkoAyBJRRkASSrKAEhSUQZAkooyAJJUlAGQpKIMgCQVZQAkqSgDIElFGQBJKsoASFJR\nBkCSijIAklRUqwBExFxEnGi2r4uIv4qIL0fE+8cdkyR1Y90ARMRu4FFgphn6BWA5M38U+OmI+J4x\nxyRJHWhzBfAqcB9wofm8BzzebD8NzI85JknqwM71dsjMCwAR8drQDHC22T4HzI05dpmIWAQWAebm\n5uj3+xt4OFvfysrKtntMo3ItBqqvw4dueeXSdvW1GDaNtVg3AGtYAXYB54HZ5vNxxi6TmUvAEsD8\n/Hz2er0Rprh19ft9tttjGpVrMVB9He4/cvzS9tHDM6XXYtg0jotR7gJaBg422/uBM2OOSZI6MMoV\nwKPAFyLiHcDbgX9g8LTOqGOSpA60vgLIzF7z64vAXcCXgUOZ+eo4Y5v6aCRJrY1yBUBm/jurd/OM\nPSZJmj6/E1iSijIAklSUAZCkogyAJBVlACSpKAMgSUUZAEkqygBIUlEGQJKKMgCSVJQBkKSiDIAk\nFWUAJKkoAyBJRRkASSrKAEhSUQZAkooyAJJUlAGQpKIMgCQVZQAkqSgDIElFGQBJKsoASFJRBkCS\nijIAklSUAZCkogyAJBVlACSpKAMgSUUZAEkqygBIUlEGQJKKMgCSVJQBkKSiDIAkFWUAJKkoAyBJ\nRW04ABGxMyK+HhH95uOWiHgoIp6NiIeH9ms1JknqxihXALcCn83MXmb2gOuBg8AdwLci4lBE3N5m\nbFMegSRpJJGZG/sNER8EHgAuAqeBF4CVzPxURBwA7gbOA/+z3lhm/toaf/4isAgwNzd3+7Fjx0Z/\ndFvQysoKs7OzXU9jS3AtBqqvw+mz5y9tv+2GHaXXYtg4x8XCwsJyZs6vt9/OEf7sZ4FDmflSRPwp\nsItBBADOAXPAK8C/thh7ncxcApYA5ufns9frjTDFravf77PdHtOoXIuBiuuw98jxoc9WT0NHD8+U\nW4urmcZxMUoAnsvM7zTbp4DrGEQAYJbB00orLcckSR0Z5ST8WETsj4gdwLuBGQbP7QPsB84Ayy3H\nJEkdGeUK4CPAZ4AAPg98FDgREZ8EDjcfLwK/2WJMktSRDQcgM59ncCfQJc0dPfcAn8zMr21kTJLU\njVGuAF4nM/8b+NwoY5KkbvhCrCQVtSlXAJK02YZvFT3zW/d0OJPtyysASSrKAEhSUQZAkooyAJJU\nlAGQpKIMgCQVZQAkqSgDIElFGQBJKsoASFJRBkCSijIAklSUbwYnaWI2+oZup8+e5/7Lfl6wJskr\nAEkqygBIUlEGQJKKMgCSVJQBkKSiDIAkFeVtoJKmYu8Yt3f684EnwwBI2lTjnOg1XT4FJElFeQUg\naWz+q/+NySsASSrKAEhSUQZAkooyAJJUlAGQpKK8C0jSG4rfFLZ5DICkkXjr5xufAZCkLWI4qkcP\nz0z87/M1AEkqyisASa34lM/24xWAJBXlFYCkq9rq/+r3jqDxdBKAiHgEeDtwPDM/2sUcJK1tq5/0\ntXmmHoCIuBfYkZl3RsSfRMS+zPyXac9Dqs4Tvbq4AugBjzfbTwIHAQMgteSJW5uliwDMAGeb7XPA\nbcNfjIhFYLH5dCUiXpji3KZhD/DtriexRbgWA65D4xfHWIv4+CZPpmMLHx/ruPj+Njt1EYAVYFez\nPcsVdyJl5hKwNO1JTUtEnMrM+a7nsRW4FgOuwyrXYtU01qKL20CXGTztA7AfONPBHCSpvC6uAP4S\nOBERNwF3Awc6mIMklTf1K4DMvMDgheBngIXMPD/tOXRs2z69NQLXYsB1WOVarJr4WkRmTvrvkCRt\nQb
4VhCYqIt4aEXdFxJ6u5yLpcgZgQjzxQUTsBv4auAN4KiJuXGOfnRHx9YjoNx+3TH2i6kREzEXE\nV6/yNY+LKTAAE9DmxNfs90hEnIyIB6c6wem5FfilzPwY8ARXfM/H0D6fzcxe83F6qjOcsmud9Jqv\nb/djYtjvsHpL+JW2/XHRNnIR8VBEPBsRD2/2HAzAZKx74ht+Swzg5ojYN+U5Tlxm/l1mPhMR72QQ\nw5Nr7HYA+MmI+Mfm5Lfd36Dwqie9CsfEayLix4GLwDevskuF42LdyEXE7Qxum78D+FZEHNrMCRiA\nCWh54uvx+rfE2HYiIoD7gJeB/1tjl2eBQ5l5B3Ad8BNTnN5UtTjp9ahxTFwP/Cpw5Bq7VTgu2kTu\nx4A/z8HdOk8A79jMCRiACWlx4rvyLTHmpjS1qcqBB4DngJ9aY5fnMvOlZvsUsC3/1dvypFfimGCw\nBp/KzP+8xj4Vjos2kZvoMWEAJqTFie+ab4mxHUTEL0fEzzWfvgVY63/4xyJif0TsAN4N/PPUJjhd\nbU562/6YaBwCHoiIPvBDEfHHa+xT4bhoE7mJHhPb9QDrVMsTX4W3xFgC3hcRTwM7gG9ExJU//+Ej\nwGPAPwEnM/NLU57jtLQ56VU4JsjMd772vDeD/+6fKHpctIncRI8JvxFsApq7gB4Hvgt4HngYeG9m\nPji0z5uBE8Df0rwlRsHvii6picAHgZ/xmKgrIn4Q+AwQwOeB3wV+OzM/MLTPmxgcE6eAw8DhzPza\nps3BAHSnCcVdwNOZebUXBlWIx4SuFBG7gHuAr2Tmv23qn20AJKkmXwOQpKIMgCQVZQAkqSgDIElF\nGQBJKur/AYa2tWklRCvtAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x7f74a6444c50>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "%matplotlib inline\n",
    "# Distribution of the submitted scores, 100 bins.\n",
    "score_column = sub[\"Score\"]\n",
    "score_column.hist(bins=100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
